{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build Speech data files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from IPython.display import display\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(7527, 10)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>wav_file</th>\n",
       "      <th>label</th>\n",
       "      <th>sig_mean</th>\n",
       "      <th>sig_std</th>\n",
       "      <th>rmse_mean</th>\n",
       "      <th>rmse_std</th>\n",
       "      <th>silence</th>\n",
       "      <th>harmonic</th>\n",
       "      <th>auto_corr_max</th>\n",
       "      <th>auto_corr_std</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ses01F_script02_2_F000</td>\n",
       "      <td>7</td>\n",
       "      <td>0.003671</td>\n",
       "      <td>0.005739</td>\n",
       "      <td>0.004434</td>\n",
       "      <td>0.003640</td>\n",
       "      <td>0.018692</td>\n",
       "      <td>-0.008143</td>\n",
       "      <td>0.023179</td>\n",
       "      <td>0.133057</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Ses01F_script02_2_F001</td>\n",
       "      <td>7</td>\n",
       "      <td>0.006365</td>\n",
       "      <td>0.011155</td>\n",
       "      <td>0.007913</td>\n",
       "      <td>0.007850</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>-0.017120</td>\n",
       "      <td>0.094578</td>\n",
       "      <td>0.213759</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Ses01F_script02_2_F006</td>\n",
       "      <td>0</td>\n",
       "      <td>0.039659</td>\n",
       "      <td>0.067939</td>\n",
       "      <td>0.049930</td>\n",
       "      <td>0.046050</td>\n",
       "      <td>0.345018</td>\n",
       "      <td>-0.004605</td>\n",
       "      <td>3.441704</td>\n",
       "      <td>9.317455</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Ses01F_script02_2_F007</td>\n",
       "      <td>4</td>\n",
       "      <td>0.014478</td>\n",
       "      <td>0.026941</td>\n",
       "      <td>0.018384</td>\n",
       "      <td>0.019687</td>\n",
       "      <td>0.422764</td>\n",
       "      <td>-0.011850</td>\n",
       "      <td>0.568261</td>\n",
       "      <td>1.928247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Ses01F_script02_2_F008</td>\n",
       "      <td>0</td>\n",
       "      <td>0.025271</td>\n",
       "      <td>0.054958</td>\n",
       "      <td>0.031571</td>\n",
       "      <td>0.044958</td>\n",
       "      <td>0.470019</td>\n",
       "      <td>-0.005120</td>\n",
       "      <td>2.529399</td>\n",
       "      <td>9.210082</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 wav_file  label  sig_mean   sig_std  rmse_mean  rmse_std  \\\n",
       "0  Ses01F_script02_2_F000      7  0.003671  0.005739   0.004434  0.003640   \n",
       "1  Ses01F_script02_2_F001      7  0.006365  0.011155   0.007913  0.007850   \n",
       "6  Ses01F_script02_2_F006      0  0.039659  0.067939   0.049930  0.046050   \n",
       "7  Ses01F_script02_2_F007      4  0.014478  0.026941   0.018384  0.019687   \n",
       "8  Ses01F_script02_2_F008      0  0.025271  0.054958   0.031571  0.044958   \n",
       "\n",
       "    silence  harmonic  auto_corr_max  auto_corr_std  \n",
       "0  0.018692 -0.008143       0.023179       0.133057  \n",
       "1  0.444444 -0.017120       0.094578       0.213759  \n",
       "6  0.345018 -0.004605       3.441704       9.317455  \n",
       "7  0.422764 -0.011850       0.568261       1.928247  \n",
       "8  0.470019 -0.005120       2.529399       9.210082  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>wav_file</th>\n",
       "      <th>label</th>\n",
       "      <th>sig_mean</th>\n",
       "      <th>sig_std</th>\n",
       "      <th>rmse_mean</th>\n",
       "      <th>rmse_std</th>\n",
       "      <th>silence</th>\n",
       "      <th>harmonic</th>\n",
       "      <th>auto_corr_max</th>\n",
       "      <th>auto_corr_std</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ses01F_script02_2_F000</td>\n",
       "      <td>5</td>\n",
       "      <td>0.003671</td>\n",
       "      <td>0.005739</td>\n",
       "      <td>0.004434</td>\n",
       "      <td>0.003640</td>\n",
       "      <td>0.018692</td>\n",
       "      <td>-0.008143</td>\n",
       "      <td>0.023179</td>\n",
       "      <td>0.133057</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Ses01F_script02_2_F001</td>\n",
       "      <td>5</td>\n",
       "      <td>0.006365</td>\n",
       "      <td>0.011155</td>\n",
       "      <td>0.007913</td>\n",
       "      <td>0.007850</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>-0.017120</td>\n",
       "      <td>0.094578</td>\n",
       "      <td>0.213759</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Ses01F_script02_2_F006</td>\n",
       "      <td>0</td>\n",
       "      <td>0.039659</td>\n",
       "      <td>0.067939</td>\n",
       "      <td>0.049930</td>\n",
       "      <td>0.046050</td>\n",
       "      <td>0.345018</td>\n",
       "      <td>-0.004605</td>\n",
       "      <td>3.441704</td>\n",
       "      <td>9.317455</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Ses01F_script02_2_F007</td>\n",
       "      <td>2</td>\n",
       "      <td>0.014478</td>\n",
       "      <td>0.026941</td>\n",
       "      <td>0.018384</td>\n",
       "      <td>0.019687</td>\n",
       "      <td>0.422764</td>\n",
       "      <td>-0.011850</td>\n",
       "      <td>0.568261</td>\n",
       "      <td>1.928247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Ses01F_script02_2_F008</td>\n",
       "      <td>0</td>\n",
       "      <td>0.025271</td>\n",
       "      <td>0.054958</td>\n",
       "      <td>0.031571</td>\n",
       "      <td>0.044958</td>\n",
       "      <td>0.470019</td>\n",
       "      <td>-0.005120</td>\n",
       "      <td>2.529399</td>\n",
       "      <td>9.210082</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 wav_file  label  sig_mean   sig_std  rmse_mean  rmse_std  \\\n",
       "0  Ses01F_script02_2_F000      5  0.003671  0.005739   0.004434  0.003640   \n",
       "1  Ses01F_script02_2_F001      5  0.006365  0.011155   0.007913  0.007850   \n",
       "6  Ses01F_script02_2_F006      0  0.039659  0.067939   0.049930  0.046050   \n",
       "7  Ses01F_script02_2_F007      2  0.014478  0.026941   0.018384  0.019687   \n",
       "8  Ses01F_script02_2_F008      0  0.025271  0.054958   0.031571  0.044958   \n",
       "\n",
       "    silence  harmonic  auto_corr_max  auto_corr_std  \n",
       "0  0.018692 -0.008143       0.023179       0.133057  \n",
       "1  0.444444 -0.017120       0.094578       0.213759  \n",
       "6  0.345018 -0.004605       3.441704       9.317455  \n",
       "7  0.422764 -0.011850       0.568261       1.928247  \n",
       "8  0.470019 -0.005120       2.529399       9.210082  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('data/audio_features.csv')\n",
    "# Keep only rows labelled 0-7. The .copy() yields an independent frame so the\n",
    "# column assignment below never writes into a slice of the raw data.\n",
    "df = df[df['label'].isin([0, 1, 2, 3, 4, 5, 6, 7])].copy()\n",
    "print(df.shape)\n",
    "display(df.head())\n",
    "\n",
    "# Collapse the 8 source labels into 6 classes: 1 and 2 are merged into 1,\n",
    "# 3 and 4 are merged into 2, and 5, 6, 7 are renumbered to 3, 4, 5.\n",
    "# NOTE(review): presumably the source ids are ang/hap/exc/sad/fru/fea/sur/neu\n",
    "# in that order -- confirm against the feature-extraction step.\n",
    "label_remap = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 4, 7: 5}\n",
    "df['label'] = df['label'].map(label_remap)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Snapshot of the frame before any rows are duplicated.\n",
    "df.to_csv('data/no_sample_df.csv')\n",
    "\n",
    "# Oversample by row duplication: 30 extra copies of label 3 (fear) and\n",
    "# 10 extra copies of label 4 (`sur_df`, presumably surprise).\n",
    "# One concat replaces the repeated DataFrame.append calls, which newer pandas\n",
    "# releases no longer provide; resulting row order and index are the same.\n",
    "# NOTE(review): the duplicates are created before the train/test split further\n",
    "# down, so copies of one utterance can land on both sides of that split.\n",
    "fear_df = df[df['label']==3]\n",
    "sur_df = df[df['label']==4]\n",
    "df = pd.concat([df] + [fear_df] * 30 + [sur_df] * 10)\n",
    "\n",
    "df.to_csv('data/modified_df.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>wav_file</th>\n",
       "      <th>label</th>\n",
       "      <th>sig_mean</th>\n",
       "      <th>sig_std</th>\n",
       "      <th>rmse_mean</th>\n",
       "      <th>rmse_std</th>\n",
       "      <th>silence</th>\n",
       "      <th>harmonic</th>\n",
       "      <th>auto_corr_max</th>\n",
       "      <th>auto_corr_std</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ses01F_script02_2_F000</td>\n",
       "      <td>5</td>\n",
       "      <td>0.010847</td>\n",
       "      <td>0.013290</td>\n",
       "      <td>0.010715</td>\n",
       "      <td>0.019386</td>\n",
       "      <td>0.024313</td>\n",
       "      <td>0.168625</td>\n",
       "      <td>0.000277</td>\n",
       "      <td>0.000468</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Ses01F_script02_2_F001</td>\n",
       "      <td>5</td>\n",
       "      <td>0.020306</td>\n",
       "      <td>0.027702</td>\n",
       "      <td>0.020774</td>\n",
       "      <td>0.042489</td>\n",
       "      <td>0.578112</td>\n",
       "      <td>0.166868</td>\n",
       "      <td>0.001141</td>\n",
       "      <td>0.000753</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Ses01F_script02_2_F006</td>\n",
       "      <td>0</td>\n",
       "      <td>0.137206</td>\n",
       "      <td>0.178822</td>\n",
       "      <td>0.142271</td>\n",
       "      <td>0.252096</td>\n",
       "      <td>0.448783</td>\n",
       "      <td>0.169317</td>\n",
       "      <td>0.041644</td>\n",
       "      <td>0.032933</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Ses01F_script02_2_F007</td>\n",
       "      <td>2</td>\n",
       "      <td>0.048793</td>\n",
       "      <td>0.069713</td>\n",
       "      <td>0.051051</td>\n",
       "      <td>0.107439</td>\n",
       "      <td>0.549911</td>\n",
       "      <td>0.167899</td>\n",
       "      <td>0.006873</td>\n",
       "      <td>0.006814</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Ses01F_script02_2_F008</td>\n",
       "      <td>0</td>\n",
       "      <td>0.086686</td>\n",
       "      <td>0.144276</td>\n",
       "      <td>0.089184</td>\n",
       "      <td>0.246100</td>\n",
       "      <td>0.611379</td>\n",
       "      <td>0.169216</td>\n",
       "      <td>0.030604</td>\n",
       "      <td>0.032553</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 wav_file  label  sig_mean   sig_std  rmse_mean  rmse_std  \\\n",
       "0  Ses01F_script02_2_F000      5  0.010847  0.013290   0.010715  0.019386   \n",
       "1  Ses01F_script02_2_F001      5  0.020306  0.027702   0.020774  0.042489   \n",
       "6  Ses01F_script02_2_F006      0  0.137206  0.178822   0.142271  0.252096   \n",
       "7  Ses01F_script02_2_F007      2  0.048793  0.069713   0.051051  0.107439   \n",
       "8  Ses01F_script02_2_F008      0  0.086686  0.144276   0.089184  0.246100   \n",
       "\n",
       "    silence  harmonic  auto_corr_max  auto_corr_std  \n",
       "0  0.024313  0.168625       0.000277       0.000468  \n",
       "1  0.578112  0.166868       0.001141       0.000753  \n",
       "6  0.448783  0.169317       0.041644       0.032933  \n",
       "7  0.549911  0.167899       0.006873       0.006814  \n",
       "8  0.611379  0.169216       0.030604       0.032553  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Name -> id table for a 4-class setup; not referenced by the other cells here.\n",
    "# NOTE(review): the `label` column holds ids 0-5 after the remap above, so\n",
    "# these ids do not line up with it -- confirm before relying on this table.\n",
    "emotion_dict = dict(ang=0, hap=1, sad=2, neu=3)\n",
    "\n",
    "# Earlier 8-way table, kept for reference only:\n",
    "# emotion_dict = {'ang': 0,\n",
    "#                 'hap': 1,\n",
    "#                 'exc': 2,\n",
    "#                 'sad': 3,\n",
    "#                 'fru': 4,\n",
    "#                 'fea': 5,\n",
    "#                 'sur': 6,\n",
    "#                 'neu': 7,\n",
    "#                 'xxx': 8,\n",
    "#                 'oth': 8}\n",
    "\n",
    "# Rescale every column from the third one onward to [0, 1]; the first two\n",
    "# columns (wav_file, label) are left untouched.\n",
    "# NOTE(review): the scaler is fit on the full frame, i.e. before the\n",
    "# train/test split performed in the next cell.\n",
    "scalar = MinMaxScaler()\n",
    "feature_columns = df.columns[2:]\n",
    "df[feature_columns] = scalar.fit_transform(df[feature_columns])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(7837, 10) (1960, 10)\n"
     ]
    }
   ],
   "source": [
    "# 80/20 split of the scaled audio features. The fixed random_state makes a\n",
    "# fresh Restart & Run All reproduce the same train/test membership.\n",
    "x_train, x_test = train_test_split(df, test_size=0.20, random_state=42)\n",
    "\n",
    "x_train.to_csv('data/s2e/audio_train.csv', index=False)\n",
    "x_test.to_csv('data/s2e/audio_test.csv', index=False)\n",
    "\n",
    "print(x_train.shape, x_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define preprocessing functions for text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `re` is imported here because normalizeString needs it; relying on a later\n",
    "# cell's import makes this cell depend on execution order.\n",
    "import re\n",
    "import unicodedata\n",
    "\n",
    "\n",
    "def unicodeToAscii(s):\n",
    "    \"\"\"Return `s` with combining marks removed.\n",
    "\n",
    "    `s` is NFD-normalized and every character whose Unicode category is\n",
    "    'Mn' is dropped; all other characters pass through unchanged.\n",
    "    \"\"\"\n",
    "    decomposed = unicodedata.normalize('NFD', s)\n",
    "    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')\n",
    "\n",
    "\n",
    "def normalizeString(s):\n",
    "    \"\"\"Lowercase `s` and reduce it to letters plus '.', '!' and '?'.\n",
    "\n",
    "    The input is lowercased, stripped and passed through unicodeToAscii.\n",
    "    Each '.', '!' or '?' then gets a space inserted in front of it, and every\n",
    "    run of any other non-letter characters is replaced by a single space.\n",
    "    The result is not stripped again, so it may start or end with a space.\n",
    "    \"\"\"\n",
    "    s = unicodeToAscii(s.lower().strip())\n",
    "    # Detach sentence-ending marks from the preceding word.\n",
    "    s = re.sub(r\"([.!?])\", r\" \\1\", s)\n",
    "    # Collapse everything that is not a letter or one of .!? into one space.\n",
    "    s = re.sub(r\"[^a-zA-Z.!?]+\", r\" \", s)\n",
    "    return s"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build Text data files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10087"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import re\n",
    "import os\n",
    "import pickle\n",
    "\n",
    "# Leading run of word characters on a transcript line; used as the dict key.\n",
    "useful_regex = re.compile(r'^(\\w+)', re.IGNORECASE)\n",
    "\n",
    "# Maps that leading code to the text found after the line's header.\n",
    "file2transcriptions = {}\n",
    "\n",
    "for sess in range(1, 6):\n",
    "    transcripts_path = 'data/IEMOCAP_full_release/Session{}/dialog/transcriptions/'.format(sess)\n",
    "    # Sorted so that, when a code repeats, the entry that wins does not depend\n",
    "    # on the arbitrary order os.listdir returns.\n",
    "    transcript_files = sorted(os.listdir(transcripts_path))\n",
    "    for transcript_name in transcript_files:\n",
    "        with open('{}{}'.format(transcripts_path, transcript_name), 'r') as transcript_file:\n",
    "            all_lines = transcript_file.readlines()\n",
    "\n",
    "        for line in all_lines:\n",
    "            code_match = useful_regex.match(line)\n",
    "            if code_match is None:\n",
    "                # Blank or otherwise unparseable line: there is no code to key it by.\n",
    "                continue\n",
    "            audio_code = code_match.group()\n",
    "            # Split on the first ':' only, so a ':' inside the spoken text does\n",
    "            # not truncate the transcription. Lines without ':' are kept whole.\n",
    "            # NOTE(review): assumes the first ':' ends the line header -- confirm.\n",
    "            transcription = line.split(':', 1)[-1].strip()\n",
    "            # A later line with the same code overwrites the earlier entry.\n",
    "            file2transcriptions[audio_code] = transcription\n",
    "# save dict\n",
    "with open('data/t2e/audiocode2text.pkl', 'wb') as file:\n",
    "    pickle.dump(file2transcriptions, file)\n",
    "len(file2transcriptions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the code -> transcription mapping; the context manager closes the\n",
    "# file handle instead of leaving it to the garbage collector.\n",
    "# NOTE(review): pickle.load can execute code embedded in the file; only load\n",
    "# pickles produced locally by this notebook.\n",
    "with open('data/t2e/audiocode2text.pkl', 'rb') as pkl_file:\n",
    "    audiocode2text = pickle.load(pkl_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(7837, 3) (1960, 3)\n"
     ]
    }
   ],
   "source": [
    "# Prepare text data\n",
    "def _make_text_frame(audio_df):\n",
    "    \"\"\"Build the wav_file / label / transcription frame for one audio split.\n",
    "\n",
    "    The first two columns are copied from `audio_df` (index included); the\n",
    "    transcription is looked up in `audiocode2text` by wav_file code and\n",
    "    passed through normalizeString. A missing code raises KeyError.\n",
    "    \"\"\"\n",
    "    text_df = audio_df[['wav_file', 'label']].copy()\n",
    "    text_df['transcription'] = [normalizeString(audiocode2text[code]) for code in audio_df['wav_file']]\n",
    "    return text_df\n",
    "\n",
    "\n",
    "# Same rows as the audio splits, so both modalities share train/test membership.\n",
    "text_train = _make_text_frame(x_train)\n",
    "text_test = _make_text_frame(x_test)\n",
    "\n",
    "text_train.to_csv('data/t2e/text_train.csv', index=False)\n",
    "text_test.to_csv('data/t2e/text_test.csv', index=False)\n",
    "\n",
    "print(text_train.shape, text_test.shape)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
